library(tidyverse) # for graphing and data cleaning
library(tidymodels) # for modeling
library(stacks) # for stacking models
library(naniar) # for analyzing missing values
library(lubridate) # for date manipulation
library(moderndive) # for King County housing data
library(vip) # for variable importance plots
library(DALEX) # for model interpretation
library(DALEXtra) # for extension of DALEX
library(patchwork) # for combining plots nicely
# Load the lending_club data set into the workspace.
data(lending_club)
Here is my GitHub link.
We’ll be using the lending_club dataset from the modeldata package, which is part of tidymodels. The outcome we are interested in predicting is Class. According to the dataset’s help page, its values are “either ‘good’ (meaning that the loan was fully paid back or currently on-time) or ‘bad’ (charged off, defaulted, [or] 21-120 days late)”.
# Exploratory plots -----------------------------------------------------------

# Density of the funded amount, one panel per loan class.
ggplot(lending_club, aes(x = funded_amnt)) +
  geom_density() +
  facet_wrap(~ Class)

# Density of the interest rate, one panel per loan class.
ggplot(lending_club, aes(x = int_rate)) +
  geom_density() +
  facet_wrap(~ Class)

# Density of annual income, one panel per loan class.
ggplot(lending_club, aes(x = annual_inc)) +
  geom_density() +
  facet_wrap(~ Class)

# Share of each loan class within every state (bars rescaled to proportions).
ggplot(lending_club, aes(x = addr_state, fill = Class)) +
  geom_bar(position = "fill")

# Number of loans in each outcome class.
count(lending_club, Class)

# Number of loans per state, largest first.
lending_club %>%
  count(addr_state, name = "count", sort = TRUE)
# Add more "bad" cases: draw 3000 rows with replacement from the loans whose
# Class is "bad" and append them to the original data.
#
# The draw is random, so seed the RNG first; the set.seed() call that precedes
# the train/test split comes too late to make this step reproducible.
#
# NOTE(review): because the resampling happens before the data are split,
# copies of the same "bad" loan can end up in both the training and the test
# set -- keep this in mind when interpreting test-set performance.
set.seed(494)
create_more_bad <- lending_club %>%
  filter(Class == "bad") %>%
  slice_sample(n = 3000, replace = TRUE)

# Original rows followed by the resampled "bad" rows.
lending_club_mod <- lending_club %>%
  bind_rows(create_more_bad)
# Fix the RNG state so the train/test partition can be reproduced.
set.seed(494)

# Put 75% of the augmented data in the training set; the rest is for testing.
lending_split <- lending_club_mod %>%
  initial_split(prop = 0.75)

lending_train <- lending_split %>% training()
lending_test <- lending_split %>% testing()
# Preprocessing recipe for predicting Class from all other variables in the
# training data.
#
# The numeric columns are converted with as.numeric() via step_mutate_at()
# (doing this column by column would be a lot of code). We'll want to do this
# for the model interpretation we'll do later.
lending_recipe <- recipe(Class ~ ., data = lending_train) %>%
  # Drop these two variables before any further processing.
  step_rm(acc_now_delinq, delinq_amnt) %>%
  # Coerce every numeric column to double.
  step_mutate_at(all_numeric(), fn = ~ as.numeric(.)) %>%
  # Disabled draft: recode annual_inc into bracket codes and treat it as a
  # factor. Conditions use `&` (not `&&`) because case_when() is vectorized.
  # step_mutate(annual_inc =
  #   case_when(annual_inc <= 9875 ~ 10,
  #             annual_inc > 9875 & annual_inc <= 40125 ~ 12,
  #             annual_inc > 40125 & annual_inc <= 85525 ~ 22,
  #             annual_inc > 85525 & annual_inc <= 163300 ~ 24,
  #             annual_inc > 163300 & annual_inc <= 207350 ~ 32,
  #             annual_inc > 207350 & annual_inc <= 518400 ~ 35,
  #             annual_inc > 518400 ~ 37)) %>%
  # step_mutate(annual_inc = as.factor(annual_inc)) %>%
  # Center and scale the non-nominal predictors.
  step_normalize(all_predictors(), -all_nominal()) %>%
  # Dummy-encode the nominal predictors; the outcome is left as is.
  step_dummy(all_nominal(), -all_outcomes())
# Sanity check: estimate the recipe on the training data and show the
# processed training set. bake(new_data = NULL) returns the same result as
# the superseded juice().
lending_recipe %>%
  prep(training = lending_train) %>%
  bake(new_data = NULL)